5. R code for plotting and statistics

# =======================================
#       Load required libraries
# =======================================
library(data.table)
library(dplyr)
library(ggplot2)
library(ggsci)
library(ggthemes)
library(ggpubr)
library(ggExtra)
library(RColorBrewer)
library(tidyr)
library(viridis)
library(gridExtra)
library(cowplot)
library(phytools)
library(ggtree)
library(ape)
library(patchwork)

# Fonts
library(extrafont)
library(fontcm)
#font_import(prompt = FALSE)
#font_install('fontcm')

# Summary statistics
library(doBy)

# =======================================
#           Set working directory
# =======================================
setwd("~/work/sylvietta_PAR/results/")

# Avoid scientific notation in plots/tables
options(scipen = 999)

# =======================================
#        Exon completeness dataset
# =======================================

# Load Sylvioidea exon data
PAR.genes.bed <- fread("datasets/allSp.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(PAR.genes.bed) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
PAR.genes.bed$species <- sub("_1EV02922", "", PAR.genes.bed$species)
PAR.genes.bed <- subset(PAR.genes.bed, c(species != "AlaRaz" & species != "GalMod"))

# Load Zebra Finch PAR data and merge
ZF.PAR <- fread("datasets/ZF.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.PAR) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
PAR.genes.bed <- rbind(PAR.genes.bed, ZF.PAR)

# Load Zebra Finch genomic positions (on chrZ)
ZF.PAR.genes.bed <- fread("datasets/ZF.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.PAR.genes.bed) <- c("chr", "chr_start", "chr_end", "Gene", "Trans", "exon_nr", "ZF")

# Merge Sylvioidea and ZF data by gene ID
PAR.genes.bed <- merge(PAR.genes.bed, ZF.PAR.genes.bed, by = c("Trans", "Gene", "exon_nr"))

# Calculate exon completeness (as % of reference length)
PAR.genes.bed$perc_found <- (PAR.genes.bed$contig_end - PAR.genes.bed$contig_start) /
  (PAR.genes.bed$chr_end - PAR.genes.bed$chr_start)

# Add gene names
geneName <- fread("datasets/PAR_genes_geneID_transID_geneName.list", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(geneName) <- c("name", "Gene", "Trans")
PAR.genes.bed <- merge(PAR.genes.bed, geneName, by = c("Trans", "Gene"))

# =======================================
#         Plot exon completeness
# =======================================

pdf("~/work/PAR/results/plots/allSp_exon_completeness.pdf", height = 10, width = 20)
ggplot(PAR.genes.bed, aes(y = species, x = exon_nr, fill = as.factor(round(perc_found, 1)))) + 
  geom_point(pch = 22, size = 3) +
  facet_wrap(~name) +
  theme_tufte() +
  scale_fill_startrek()
dev.off()

# =======================================
#            Z Chromosome Analyses
# =======================================

# Load gene positions for ZF chrZ
ZF.Z.genes.bed <- fread("datasets/ZF.Z.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.Z.genes.bed) <- c("chr", "chr_start", "chr_end", "Gene", "Trans", "exon_nr", "ZF")

# Start position for each gene (used to sort/annotate later)
ZF.Z.genes.startPos <- aggregate(chr_start ~ chr + Gene + Trans, data = ZF.Z.genes.bed, FUN = max)

# Get list of PAR genes from ZF data
ZF.PAR <- fread("datasets/ZF.PAR.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(ZF.PAR) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
PAR.genes <- unique(ZF.PAR$Gene)

# Load Sylvioidea data for all Z genes 
allSp.Z.genes.bed <- fread("datasets/allSp.Z.genes.bed", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(allSp.Z.genes.bed) <- c("contig", "contig_start", "contig_end", "Gene", "Trans", "exon_nr", "species")
allSp.Z.genes.bed <- subset(allSp.Z.genes.bed, c(species != "AlaRaz" & species != "GalMod"))

# Load sample information for Sylvioidea species
samples.sex <- fread("datasets/samples_sex.tsv", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(samples.sex) <- c("indv", "species", "sex")
samples.sex <- subset(samples.sex, c(species != "AlaRaz" & species != "GalMod"))

# List of all Z-linked genes per species (used for filling in singleton matrix)
fillInMissing <- fread("datasets/Z.genes.allSp.list", fill = TRUE, header = FALSE, stringsAsFactors = FALSE)
colnames(fillInMissing) <- c("chr", "Gene", "Trans", "species")
fillInMissing <- merge(fillInMissing, ZF.Z.genes.startPos, by = c("chr", "Gene", "Trans"))

# =======================================
#       Load gene trees for analysis
# =======================================

setwd("~/work/PAR/mafft_ZF/")

# Load trees
LMAN1  <- read.tree("LMAN1.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized2 <- read.tree("uncharacterized2.realn.fasta.auto.gt.0.8.trim.treefile")
RAX    <- read.tree("RAX.realn.fasta.auto.gt.0.8.trim.treefile")
GRP    <- read.tree("GRP.realn.fasta.auto.gt.0.8.trim.treefile")
SEC11C <- read.tree("SEC11C.realn.fasta.auto.gt.0.8.trim.treefile")
ZNF532 <- read.tree("ZNF532.realn.fasta.auto.gt.0.8.trim.treefile")
MALT1  <- read.tree("MALT1.realn.fasta.auto.gt.0.8.trim.treefile")
ALPK2  <- read.tree("ALPK2.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized3 <- read.tree("uncharacterized3.realn.fasta.auto.gt.0.8.selectseqs.trim.treefile")
NEDD4L <- read.tree("NEDD4L.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized4 <- read.tree("uncharacterized4.realn.fasta.auto.gt.0.8.selectseqs.trim.treefile")
ATP8B1 <- read.tree("ATP8B1.realn.fasta.auto.gt.0.8.trim.treefile")
NARS1  <- read.tree("NARS1.realn.fasta.auto.gt.0.8.trim.treefile")
FECH   <- read.tree("FECH.realn.fasta.auto.gt.0.8.trim.treefile")
ST8SIA3 <- read.tree("ST8SIA3.realn.fasta.auto.gt.0.8.trim.treefile")
uncharacterized6 <- read.tree("uncharacterized6.realn.fasta.auto.gt.0.8.trim.treefile")
WDR7   <- read.tree("WDR7.realn.fasta.auto.gt.0.8.trim.treefile")

# Quick check of one tree
plot(WDR7)

# =======================================
#   Extract pairwise cophenetic distances
# =======================================

# Helper function to extract distances with gene label
extract_distances <- function(tree, gene_name) {
  dist_df <- melt(cophenetic.phylo(tree))
  dist_df$Gene <- gene_name
  return(dist_df)
}

# Create distance data for each gene
dist.list <- list(
  extract_distances(RAX, "RAX"),
  extract_distances(ATP8B1, "ATP8B1"),
  extract_distances(LMAN1, "LMAN1"),
  extract_distances(uncharacterized2, "uncharacterized2"),
  extract_distances(GRP, "GRP"),
  extract_distances(SEC11C, "SEC11C"),
  extract_distances(ZNF532, "ZNF532"),
  extract_distances(MALT1, "MALT1"),
  extract_distances(ALPK2, "ALPK2"),
  extract_distances(uncharacterized3, "uncharacterized3"),
  extract_distances(NEDD4L, "NEDD4L"),
  extract_distances(uncharacterized4, "uncharacterized4"),
  extract_distances(NARS1, "NARS1"),
  extract_distances(ST8SIA3, "ST8SIA3"),
  extract_distances(uncharacterized6, "uncharacterized6"),
  extract_distances(WDR7, "WDR7"),
  extract_distances(FECH, "FECH")
)

# Combine all into a single data frame
dist.data <- rbindlist(dist.list)

# Remove self-self comparisons
dist.data <- subset(dist.data, Var1 != Var2)

# Extract species names from sample IDs
dist.data <- dist.data %>%
  separate(Var1, into = c("Species1", "Species1_sex"), sep = "_") %>%
  separate(Var2, into = c("Species2", "Species2_sex"), sep = "_")

# =======================================
#     Filter for intraspecific pairs
# =======================================

# Keep only comparisons between individuals of the same species
sp.comp.dist.data <- dist.data %>%
  filter(Species1 == Species2) %>%
  select(Species1, Gene, value)

# Rename and order species factor
sp.comp.dist.data$Species1 <- sub("HirDau", "CecDau", sp.comp.dist.data$Species1)
sp.comp.dist.data$Species1 <- sub("TurAlt", "ArgAlt", sp.comp.dist.data$Species1)
sp.comp.dist.data$Species1 <- factor(sp.comp.dist.data$Species1, levels = c(
  "TaeGut", "FicAlb", "CetCet", "AegCau", "PhyCol", "PycBar", "SylAtr", "ArgAlt",
  "CecDau", "AcrSch", "LocLus", "CisJun", "SylBra", "PanBia", "EreAlp", "AlaArv"
))

sp.comp.dist.data <- unique(sp.comp.dist.data)
# =======================================
#        Plot heatmap of distances
# =======================================

pdf("~/work/PAR/results/plots/PAR_heatmap.pdf")
ggplot(sp.comp.dist.data, aes(Gene, Species1, fill = value)) +
  geom_tile(color = "black") +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  theme_tufte() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 14),
    axis.text.y = element_text(size = 14)
  ) +
  labs(title = "", x = "", y = "", fill = "")
dev.off()

# =======================================
#    Annotate species with fusion type
# =======================================

sp.comp.dist.data$type <- "Z + 4A"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("AlaArv")] <- "Z + 4A + 3 + 5"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("EreAlp", "PanBia")] <- "Z + 4A + 3"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("CisJun")] <- "Z + 4A + 4"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("SylBra")] <- "Z + 4A + 8"
sp.comp.dist.data$type[sp.comp.dist.data$Species1 %in% c("TaeGut", "FicAlb")] <- "Z"

# =======================================
#     Summary statistics by group
# =======================================

summary_stats <- summaryBy(value ~ type + Species1, data = sp.comp.dist.data,
                           keep.names = TRUE, FUN = c(median, sd, IQR))
print(summary_stats)

# =======================================
#      Statistical comparison (vs Zebra finch)
# =======================================

reference_species <- "TaeGut"
ref_data <- filter(sp.comp.dist.data, Species1 == reference_species)

ref_data %>%
  group_by(Species1) %>%
  summarise(
    n            = sum(!is.na(value)),
    median_value = median(value, na.rm = TRUE),
    sd_value     = sd(value, na.rm = TRUE),
    iqr_value    = IQR(value, na.rm = TRUE))



results <- sp.comp.dist.data %>%
  filter(Species1 != reference_species) %>%
  group_by(Species1) %>%
  summarise(
    n            = sum(!is.na(value)),
    median_value = median(value, na.rm = TRUE),
    sd_value     = sd(value, na.rm = TRUE),
    iqr_value    = IQR(value, na.rm = TRUE),
    p_value = if (n() > 1) {
      
      wilcox.test(value, ref_data$value)$p.value
    } else {
      NA
    },
    W_statistic = if (n() > 1) {
      wilcox.test(value, ref_data$value)$statistic
    } else {
      NA
    },
    higher_than_ref = median(value, na.rm = TRUE) > median(ref_data$value, na.rm = TRUE)
  ) %>%
  mutate(
    bonferroni_p = p.adjust(p_value, method = "bonferroni"),
    bonferroni_p_formatted = format(bonferroni_p, digits = 4, scientific = FALSE)
  )

print(results)

print(results)
write.table(results, file = "~/work/PAR/results/tables/pairwise_distance_summary.tsv", sep = "\t", row.names = FALSE, quote = FALSE)


# =======================================
#       Add gene order by position
# =======================================

# Get gene start positions (one per gene)
PAR.genes.bed.small <- PAR.genes.bed %>%
  select(name, chr_start) %>%
  distinct() %>%
  group_by(name) %>%
  summarise(chr_start = min(chr_start)) %>%
  arrange(chr_start) %>%
  mutate(order = row_number())

# Merge position/order into distance data
sp.comp.dist.data.pos <- merge(sp.comp.dist.data, PAR.genes.bed.small,
                               by.x = "Gene", by.y = "name")
sp.comp.dist.data.pos <- sp.comp.dist.data.pos %>%
  arrange(order) %>%
  mutate(rownr = row_number())

write.table(sp.comp.dist.data.pos, file = "~/work/PAR/results/tables/pairwise_distance_per_gene.tsv", sep = "\t", row.names = FALSE, quote = FALSE)


# =======================================
#        Plot flipped PAR heatmap
# =======================================

setwd("~/work/PAR/results/plots")

pdf("PAR_heatmap_flip.pdf", height = 10, width = 18)
ggplot(
  subset(sp.comp.dist.data.pos, Species1 != "GalMod"),
  aes(Species1, reorder(Gene, order), fill = value)
) +
  geom_tile(color = "black", size = 0.2) +
  scale_fill_viridis(direction = -1, option = "viridis") +
  theme_tufte(base_size = 16) +
  theme(
    axis.text.x = element_text(size = 24, angle = 90, hjust = 1),
    axis.text.y = element_text(size = 24)
  ) +
  labs(title = " ", x = "", y = "", fill = "")
dev.off()

# Embed fonts for publication (for journal compatibility)
embed_fonts("PAR_heatmap_flip.pdf", outfile = "PAR_heatmap_flip_font.pdf")

# =======================================
#     Boxplot by species (with colors)
# =======================================

myColors <- ifelse(levels(sp.comp.dist.data$Species1)=="AlaArv" , rgb(163/255,213/255,91/255) , 
                   ifelse(levels(sp.comp.dist.data$Species1)=="CisJun", rgb(163/255,213/255,91/255),
                          ifelse(levels(sp.comp.dist.data$Species1)=="SylBra", rgb(163/255,213/255,91/255),
                                 "gold" ) ))

pdf("pairwise_distance_fusion_types3_new.pdf", width = 10, height = 5)
boxplot(value ~ Species1, data = sp.comp.dist.data,
        notch = FALSE,
        col = myColors,
        main = " ", xlab = " ")
dev.off()

# Embed for publication
embed_fonts("pairwise_distance_fusion_types3_new.pdf",
            outfile = "pairwise_distance_fusion_types3_new_font.pdf")

# =======================================
#      Fusion Category Comparisons
# =======================================

# Extra fusion group
extra.fusion <- c("AlaArv", "PanBia", "EreAlp", "SylBra", "CisJun")
extra.sc.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, Species1 %in% extra.fusion) %>%
  mutate(type = "Extra fusion")

# No extra fusion group
no.extra.sc.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, !Species1 %in% extra.fusion) %>%
  mutate(type = "No extra fusion")

# Sex-linked PAR group
sex_linked_species <- c("AlaArv", "SylBra", "CisJun")
sex_linked_PAR.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, Species1 %in% sex_linked_species) %>%
  mutate(PAR_type = "Extra fusion")

# Autosomal PAR group
autosomal_PAR.sp.comp.dist.data.pos <- filter(sp.comp.dist.data.pos, !Species1 %in% sex_linked_species) %>%
  mutate(PAR_type = "No extra fusion")

# =======================================
#      Annotate species full names
# =======================================

sp.comp.dist.data.pos$species_name <- "null"

sp.comp.dist.data.pos <- sp.comp.dist.data.pos %>%
  mutate(species_name = recode(Species1,
                               "TaeGut" = "Taeniopygia guttata (Group I: Z)",
                               "AlaArv" = "Alauda arvensis (Group VII: Z;4A,3,5)*",
                               "EreAlp" = "Eremophila alpestris (Group VI: Z;4A,3)",
                               "PanBia" = "Panurus biarmicus  (Group V: Z;4A;3;5)",
                               "SylBra" = "Sylvietta brachyura (Group IV: Z;4A;8)*",
                               "AegCau" = "Aegithalos caudatus (Group II: Z;4A)",
                               "AcrSch" = "Acrocephalus schoenobaenus (Group II: Z;4A)",
                               "CetCet" = "Cettia cetti (Group II: Z;4A)",
                               "CisJun" = "Cisticola juncidis (Group III: Z;4A;4)*",
                               "PhyCol" = "Phylloscopus collybita (Group II: Z;4A)",
                               "CecDau" = "Cecropis daurica (Group II: Z;4A)",
                               "FicAlb" = "Ficedula albicollis (Group I: Z)",
                               "PycBar" = "Pycnonotus barbatus (Group II: Z;4A)",
                               "SylAtr" = "Sylvia atricapilla (Group II: Z;4A)",
                               "LocLus" = "Locustella luscinioides (Group II: Z;4A)",
                               "ArgAlt" = "Argya altirostris (Group II: Z;4A)"
  ))

# =======================================
#    Faceted Spearman Correlation Plot
# =======================================

setwd("~/work/PAR/results")

# Prepare chromosome start position (Mb)
sp.comp.dist.data.pos$chr_start_mb <- sp.comp.dist.data.pos$chr_start / 1e6

# Sort unique species names
sp_name <- unique(sp.comp.dist.data.pos$species_name)

# Define color palette
cols <- viridis(17, alpha = 1, begin = 0, end = 1, option = "D")

# Helper function for axis formatting
scaleFUN <- function(x) sprintf("%.3f", x)
scaleFUN2 <- function(x) sprintf("%.1f", x)

cor_stats <- data.frame(
  species = character(),
  rho = numeric(),
  p_value = numeric(),
  N = integer(),
  stringsAsFactors = FALSE
)

plot_list <- list()

for (i in seq_along(sp_name)) {
  data_i <- filter(sp.comp.dist.data.pos, species_name == sp_name[i])
  y_limit <- max(data_i$value, na.rm = TRUE) + 0.001
  n_value <- sum(!is.na(data_i$value))
  
  # Compute Spearman test
  cor_test <- cor.test(data_i$chr_start_mb, data_i$value, method = "spearman", exact = FALSE)
  #rho_n_label <- sprintf("ρ = %.2f; p = %.3g; N = %d", cor_test$estimate, cor_test$p.value, n_value)
  rho_n_label <- bquote(italic(rho) == .(round(cor_test$estimate, 2)) * "; p = " * .(formatC(cor_test$p.value, format = "g", digits = 3)) * "; N = " * .(n_value))
  
  # Determine whether to add regression line
  reg_line <- if (cor_test$p.value < 0.05) "reg.line" else "none"
  
  p <- ggscatter(
    data_i, x = "chr_start_mb", y = "value",
    size = 5, shape = 21,
    fill = "#6A9F33",  # slightly darker green for clarity
    add = reg_line, conf.int = FALSE,
    repel = TRUE,
    title = sp_name[i],
    xlab = "Z chromosome position (Mb)", ylab = "Female-male pairwise distance",
    show.legend.text = FALSE, show.legend = FALSE,
    cor.coef = FALSE
  ) +
    annotate("text", x = 0.05, y = y_limit*0.9, label = rho_n_label,
             size = 6, color = "black", fontface = "italic", family = "Helvetica", hjust = 0) +
    scale_y_continuous(labels = scaleFUN) +
    scale_x_continuous(labels = scaleFUN2) +
    theme_minimal(base_family = "Helvetica") +
    theme(
      plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
      axis.text = element_text(size = 16, colour = "black"),
      axis.title = element_text(size = 14),
      panel.border = element_rect(color = "black", fill = NA, linewidth = 0.8)
    )
  
  cor_stats <- rbind(
    cor_stats,
    data.frame(
      species = as.character(sp_name[i]),
      rho = as.numeric(cor_test$estimate),
      p_value = cor_test$p.value,
      N = n_value,
      stringsAsFactors = FALSE
    )
  )
  plot_list[[i]] <- p
}

plot_list[[6]] <- plot_list[[6]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[7]] <- plot_list[[7]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  
plot_list[[1]] <- plot_list[[1]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())

plot_list[[9]] <- plot_list[[9]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[8]] <- plot_list[[8]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label
plot_list[[11]] <- plot_list[[11]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label

plot_list[[15]] <- plot_list[[15]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[13]] <- plot_list[[13]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label
plot_list[[12]] <- plot_list[[12]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label

plot_list[[5]] <- plot_list[[5]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[4]] <- plot_list[[4]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label

# Define each row
row1 <- plot_list[[6]] + plot_list[[7]] + plot_list[[1]] + plot_layout(nrow = 1, guides = "collect", tag_level = "keep")
row2 <- plot_list[[9]] + plot_list[[8]] + plot_list[[11]] + plot_layout(nrow = 1, guides = "collect")
row3 <- plot_list[[15]] + plot_list[[13]] + plot_list[[12]] + plot_layout(nrow = 1, guides = "collect")
row4 <- plot_list[[5]] + plot_list[[4]] + plot_spacer() + plot_layout(nrow = 1, widths = c(1, 1, 1.14)) 

# Combine all rows
final_plot <- (row1 / row2 / row3 / row4) + 
  plot_layout(
    guides = "collect", 
    widths = c(1, 1, 1)  # Adjust the widths to control the layout of the first three columns
  ) &
  theme(
    legend.position = "none", 
    plot.margin = margin(0, 0, 0, 0), # Set equal margins
    panel.spacing = unit(0.5, "cm") # Ensure consistent spacing between plots
    )

final_plot <- final_plot + 
  plot_annotation(tag_levels = 'a') & 
  theme(plot.tag = element_text(size = 20, face = "bold"))

ggsave("~/work/PAR/results/plots/Figure5_combined.pdf", final_plot, width = 18, height = 16)


plot_list[1] # AcrSch
plot_list[2] # SylBra
plot_list[3] # AlaArv
plot_list[4] # EreAlp
plot_list[5] # PanBia
plot_list[6] # TaeGut
plot_list[7] # FicAlb
plot_list[8] # AegCau
plot_list[9] # CetCet
plot_list[10] # CisJun
plot_list[11] # PhyCol
plot_list[12] # LocLus
plot_list[13] # CecDau
plot_list[14] # SylAtr
plot_list[15] # ArgAlt
plot_list[16] # PycBar


plot_list[[16]] <- plot_list[[16]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[14]] <- plot_list[[14]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label
plot_list[[10]] <- plot_list[[10]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label
plot_list[[2]] <- plot_list[[2]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_text(size = 14))
plot_list[[3]] <- plot_list[[3]] + theme(axis.text.y = element_text(size = 16), axis.title.y = element_blank())  # Hide the label


# Define each row
Srow1 <- plot_list[[16]] + plot_list[[14]] + plot_list[[10]] + plot_layout(nrow = 1, guides = "collect", tag_level = "keep")
Srow2 <- plot_list[[2]] + plot_list[[3]] + plot_spacer() + plot_layout(nrow = 1, widths = c(1, 1, 1.14)) 

# Combine all rows
final_plot <- (Srow1 / Srow2) + 
  plot_layout(
    guides = "collect", 
    widths = c(1, 1, 1)  # Adjust the widths to control the layout of the first three columns
  ) &
  theme(
    legend.position = "none", 
    plot.margin = margin(0, 0, 0, 0), # Set equal margins
    panel.spacing = unit(0.5, "cm") # Ensure consistent spacing between plots
  )

final_plot <- final_plot + 
  plot_annotation(tag_levels = 'a') & 
  theme(plot.tag = element_text(size = 20, face = "bold"))

ggsave("~/work/PAR/results/plots/FigureS12_combined.pdf", final_plot, width = 18, height = 8)


setwd("~/work/PAR/results/plots")
pdf("pairwise_distance_spearman.allSp.pdf", width= 14, height = 12)
ggscatter(sp.comp.dist.data.pos, x = "chr_start", y = "value", fill = "Species1", 
          size = 3, shape = 21, facet.by = "species_name", combine = T,xlab = "Z chromosome position", 
          add = "reg.line", conf.int = F,ylab = "pairwise distance", repel = T,
          add.params = list(color = "Species1", fill = "grey", size = 0.75),
          title = NULL, show.legend.text = FALSE, 
          cor.coef = TRUE,
          scales = "free_y",
          cor.coeff.args = list(method = "spearman", cor.coef.name = "rho"))
dev.off()
embed_fonts("pairwise_distance_spearman.allSp.pdf", outfile="pairwise_distance_spearman.allSp_font.pdf")


# =======================================
#    Private allele stats and plots
# =======================================

Z.singletons=read.table("~/work/sylvietta_PAR/results/PAR_FicAlb/allSp.Z.genes.singletons.S.sum.krakenSNPs.out",header=FALSE,fill=TRUE,stringsAsFactor=FALSE)
Z.singletons <- plyr::rename(Z.singletons, c("V1"="nr.singletons", "V2"="Gene","V3"="indv", "V4"="species", "V5"="sex"))
Z.singletons <- subset(Z.singletons, c(species != "AlaRaz" & species != "GalMod"))
Z.singletons$species <- sub("HirDau", "CecDau", Z.singletons$species)
Z.singletons$species <- sub("TurAlt", "ArgAlt", Z.singletons$species)
Z.singletons$species <- sub("\\_1EV02922", "", Z.singletons$species)
specieslist <- unique(Z.singletons$species)

samples.sex$species <- sub("HirDau", "CecDau", samples.sex$species)
samples.sex$species <- sub("TurAlt", "ArgAlt", samples.sex$species)
samples.sex$species <- sub("\\_1EV02922", "", samples.sex$species)

samples.sex.15 <- samples.sex[ samples.sex$species %in% specieslist, ]


samples.Z.gene.combination <- crossing(ZF.Z.genes.startPos, samples.sex.15)

Z.singletons <- merge(Z.singletons, samples.Z.gene.combination, all=TRUE, by=c("Gene", "sex", "indv", "species"))

Z.singletons[is.na(Z.singletons)] <- 0
PAR.genes.bed.small2 <- unique(PAR.genes.bed %>% select(Gene, name, chr_start))
PAR.genes.bed.small2 <- aggregate(chr_start ~ name + Gene, PAR.genes.bed.small2, function(x) min(x))
PAR.genes.bed.small2 <- PAR.genes.bed.small2[order(PAR.genes.bed.small2$chr_start),] 
PAR.genes.bed.small2$order <- seq.int(nrow(PAR.genes.bed.small2))
Z.singletons.PAR <- merge(Z.singletons, PAR.genes.bed.small2, by=c("Gene"))
Z.singletons.PAR$species <- ordered(Z.singletons.PAR$species,
                                    levels = c("TaeGut", "FicAlb", "CetCet", "AegCau", "PhyCol", "PycBar", "SylAtr", "ArgAlt", "CecDau", "LocLus", "AcrSch", "CisJun", 
                                               "SylBra", "PanBia", "EreAlp", "AlaArv"))

sc.genes2 <- unique(sp.comp.dist.data.pos.symbol$Gene)

Z.singletons.PAR <- Z.singletons.PAR[ Z.singletons.PAR$name %in% sc.genes2, ]

pdf("~/work/PAR/results/plots/PAR_heatmap_singletons_per_sex.pdf", height = 10, width =10)
ggplot(Z.singletons.PAR, aes(reorder(name, order),species, fill= nr.singletons)) + 
  geom_tile( colour = "black", size = 0.5)  +  scale_fill_viridis(direction = -1, option = "viridis") +
  theme_tufte(base_family="Helvetica", base_size = 12) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  theme(axis.text.x=element_text(size=18, angle=90), axis.text.y=element_text(size=18)) + labs(title="Per sex number of singletons", x="", y="", fill="") + facet_wrap(~sex, ncol = 1)
dev.off()

Z.singletons.PAR_wide <- dcast(Z.singletons.PAR, species + Gene + Trans + order + name ~ sex, value.var="nr.singletons")

pdf("~/work/PAR/results/PAR_heatmap_singletons_sex_diff.pdf", height = 10, width =10)
ggplot(Z.singletons.PAR_wide, aes(reorder(name, order),species, fill= female-male)) + 
  geom_tile( colour = "black", size = 0.5)  +  scale_fill_viridis(direction = -1, option = "viridis") +
  theme_tufte(base_size = 18, base_family = "serif")+ theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  theme(axis.text.x=element_text(size=18, angle=90), axis.text.y=element_text(size=18)) + labs(title="Female-male difference in singletons", x="", y="", fill="") 
dev.off()

### Wilcoxon tests for singletons 
results <- Z.singletons.PAR_wide %>%
  group_by(species) %>%
  summarise(
    p_value = if (n() > 1) {
      wilcox.test(female, male, paired = TRUE)$p.value
    } else {
      NA  # Not enough data for the test
    },
    female_higher = mean(female > male)  # Proportion of cases where female > male
  )

results <- Z.singletons.PAR_wide %>%
  group_by(species) %>%
  summarise(
    median_female = median(female, na.rm = TRUE),  # Mean value for females
    median_male = median(male, na.rm = TRUE),      # Mean value for males
    n_male            = sum(!is.na(male)),
    sd_male     = sd(male, na.rm = TRUE),
    iqr_male    = IQR(male, na.rm = TRUE),
    n_female            = sum(!is.na(female)),
    sd_female     = sd(female, na.rm = TRUE),
    iqr_female    = IQR(female, na.rm = TRUE),
    p_value = if (n() > 1) {
      wilcox.test(female, male, paired = TRUE)$p.value
    } else {
      NA  # Not enough data for the test
    },
    W_statistic = if (n() > 1) {
      wilcox.test(female, male, paired = TRUE)$statistic
    } else {
      NA
    },
    female_higher = mean(female > male, na.rm = TRUE)  # Proportion of cases where female > male
  ) %>%
  mutate(
    bonferroni_p = p.adjust(p_value, method = "bonferroni")
  )

outname <- sprintf("~/work/PAR/results/tables/singleton_wilcoxon.tsv")
write.table(results, file = outname, sep = "\t", quote = FALSE, row.names = F)


# =======================================
#    Read depth stats and plots
# =======================================

# Depth stats
Z.depth=read.table("~/work/sylvietta_PAR/results/datasets/allSp.Z.genes.bamstat04.geneInfo.out",header=FALSE,fill=TRUE,stringsAsFactor=FALSE)
Z.depth <- plyr::rename(Z.depth, c("V1"="indv", "V2"="min.depth","V3"="max.depth","V4"="avg.depth", "V5"="median.depth","V6"="species","V7"="Gene","V8"="Trans"))
Z.depth <- subset(Z.depth, c(Z.depth$species!="AlaRaz" & Z.depth$species!="GalMod"))
Z.depth$species <- sub("HirDau", "CecDau", Z.depth$species)
Z.depth$species <- sub("TurAlt", "ArgAlt", Z.depth$species)
Z.depth$species <- sub("\\_1EV02922", "", Z.depth$species)

Z.depth.mean <- aggregate(avg.depth ~ species + Gene + indv + Trans + indv, Z.depth, mean)
Z.depth.mean.combination <- merge(Z.depth.mean, samples.Z.gene.combination)
Z.depth.mean.combination_wide <- dcast(Z.depth.mean.combination, species + Gene + Trans + chr_start ~ sex, value.var="avg.depth")

Z.depth.mean.combination_wide <- subset(Z.depth.mean.combination_wide, c(Z.depth.mean.combination_wide$female>5 & Z.depth.mean.combination_wide$female<80))
Z.depth.mean.combination_wide <- subset(Z.depth.mean.combination_wide, c(Z.depth.mean.combination_wide$male>5 & Z.depth.mean.combination_wide$male<80))
aggregate(male ~ species, data = Z.depth.mean.combination_wide, mean)

### Calculate coverage ratio and remove outliers
Z.depth.mean.combination_wide$cov_ratio <- Z.depth.mean.combination_wide$female/Z.depth.mean.combination_wide$male
Z.depth.mean.combination_wide <- subset(Z.depth.mean.combination_wide, c(Z.depth.mean.combination_wide$cov_ratio < 2.5))
Z.depth.mean.combination_wide.median <- aggregate(cov_ratio ~ species, data = Z.depth.mean.combination_wide, median)
Z.depth.mean.combination_wide.median <- plyr::rename(Z.depth.mean.combination_wide.median, c("cov_ratio"="species_median"))
Z.depth.mean.combination_wide <- merge(Z.depth.mean.combination_wide, Z.depth.mean.combination_wide.median, by=c("species"))
Z.depth.mean.combination_wide$cov_ratio_scaled <- Z.depth.mean.combination_wide$cov_ratio / Z.depth.mean.combination_wide$species_median
Z.depth.mean.combination_wide <- na.omit(Z.depth.mean.combination_wide) 
Z.depth.mean.combination_wide$cov_ratio_scaled <- Z.depth.mean.combination_wide$cov_ratio_scaled - 0.5

Z.depth.mean.combination_wide.PAR <- merge(Z.depth.mean.combination_wide, PAR.genes.bed.small2, by=c("Gene"))

Z.depth.mean.combination_wide.PAR <- Z.depth.mean.combination_wide.PAR[ Z.depth.mean.combination_wide.PAR$name %in% sc.genes2, ]
#Z.depth.mean.combination_wide.PAR$species <- sub("HirDau", "CecDau", Z.depth.mean.combination_wide.PAR$species)

Z.depth.mean.combination_wide$species_name <- "null"

Z.depth.mean.combination_wide <- Z.depth.mean.combination_wide %>%
  mutate(species_name = recode(species,
                               "TaeGut" = "Taeniopygia guttata (Group I: Z)",
                               "AlaArv" = "Alauda arvensis (Group VII: Z;4A,3,5)*",
                               "EreAlp" = "Eremophila alpestris (Group VI: Z;4A,3)",
                               "PanBia" = "Panurus biarmicus  (Group V: Z;4A;3;5)",
                               "SylBra" = "Sylvietta brachyura (Group IV: Z;4A;8)*",
                               "AegCau" = "Aegithalos caudatus (Group II: Z;4A)",
                               "AcrSch" = "Acrocephalus schoenobaenus (Group II: Z;4A)",
                               "CetCet" = "Cettia cetti (Group II: Z;4A)",
                               "CisJun" = "Cisticola juncidis (Group III: Z;4A;4)*",
                               "PhyCol" = "Phylloscopus collybita (Group II: Z;4A)",
                               "CecDau" = "Cecropis daurica (Group II: Z;4A)",
                               "FicAlb" = "Ficedula albicollis (Group I: Z)",
                               "PycBar" = "Pycnonotus barbatus (Group II: Z;4A)",
                               "SylAtr" = "Sylvia atricapilla (Group II: Z;4A)",
                               "LocLus" = "Locustella luscinioides (Group II: Z;4A)",
                               "ArgAlt" = "Argya altirostris (Group II: Z;4A)"
  ))

options(scipen=999)
pdf("~/work/PAR/results/plots/FigureS1_PAR_genes_depth.pdf", height = 10, width =15)
ggplot(subset(Z.depth.mean.combination_wide, chr_start<10000000), aes(x = chr_start/1000000, y = cov_ratio_scaled, fill = species_name)) +
  geom_rect(mapping=aes(xmin=0, xmax=0.450000, ymin=-Inf, ymax=Inf), fill="grey" , alpha=0.5) +
  geom_point(pch = 21) + facet_wrap(~species_name) + theme_minimal() +geom_hline(yintercept = 0.5) + 
  labs(x = "Z chromosome position (Mb)", y = "Female-to-male genome coverage ratio") + 
  theme_bw(base_family="Helvetica", base_size = 12) + theme(legend.position="none") 
dev.off()

pdf("~/work/PAR/results/plots/PAR_genes_depth.female.pdf", height = 10, width =15)
ggplot(subset(Z.depth.mean.combination_wide, chr_start<10000000), aes(x = chr_start, y = female, fill = species)) +
  geom_rect(mapping=aes(xmin=0, xmax=450000, ymin=-Inf, ymax=Inf), fill="grey" , alpha=0.5) +
  geom_point(pch = 21) + facet_wrap(~species) + theme_minimal() +geom_hline(yintercept = 0.5)
dev.off()

pdf("~/work/PAR/results/plots/PAR_genes_depth.male.pdf", height = 10, width =15)
ggplot(subset(Z.depth.mean.combination_wide, chr_start<10000000), aes(x = chr_start, y = male, fill = species)) +
  geom_rect(mapping=aes(xmin=0, xmax=450000, ymin=-Inf, ymax=Inf), fill="grey" , alpha=0.5) +
  geom_point(pch = 21) + facet_wrap(~species) + theme_minimal() +geom_hline(yintercept = 0.5)
dev.off()